df <- read.table("emissionssw.dat", header=TRUE)
head(df)
## nox noxem ws humidity
## 1 170.25 1580.1338 0.9000 0.3195116
## 2 314.40 3248.0421 1.0500 0.4041845
## 3 270.15 3207.5551 0.9665 0.3715211
## 4 177.65 2798.4168 1.0335 0.3065146
## 5 168.00 3181.8449 2.0165 0.2604632
## 6 75.10 270.7519 1.5170 0.2102720
Quite a few outliers across all time series. Consider quantile clipping to prevent influence of these observations? Maybe even remove from data and don’t bother trying to predict
plot.ts(df)
lapply(names(df), function(col) {
acf(df[[col]], main = paste("ACF for", col))
})
## [[1]]
##
## Autocorrelations of series 'df[[col]]', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 1.000 0.448 0.230 0.199 0.207 0.238 0.184 0.143 0.124 0.149 0.122 0.068 0.046
## 13 14 15 16 17 18 19 20 21 22 23 24 25
## 0.020 0.031 0.060 0.035 0.046 0.076 0.045 0.035 0.052 0.071 0.067 0.081 0.053
## 26 27 28 29 30 31 32 33
## 0.034 0.030 0.044 0.062 0.053 0.063 0.072 0.104
##
## [[2]]
##
## Autocorrelations of series 'df[[col]]', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.496 0.043 -0.145 -0.109 0.040 0.122 0.135 0.076 0.015 -0.039
## 11 12 13 14 15 16 17 18 19 20 21
## -0.054 -0.039 -0.026 0.020 0.033 0.012 -0.013 -0.037 -0.053 -0.038 -0.005
## 22 23 24 25 26 27 28 29 30 31 32
## 0.021 -0.001 -0.040 -0.051 -0.034 -0.027 -0.010 0.004 0.012 0.000 0.002
## 33
## 0.039
##
## [[3]]
##
## Autocorrelations of series 'df[[col]]', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.616 0.385 0.273 0.223 0.202 0.159 0.143 0.092 0.058 0.033
## 11 12 13 14 15 16 17 18 19 20 21
## 0.009 0.018 0.003 0.010 -0.003 -0.015 -0.002 -0.007 -0.013 -0.016 -0.011
## 22 23 24 25 26 27 28 29 30 31 32
## -0.006 -0.007 0.010 0.042 0.046 0.017 0.001 0.023 0.031 0.054 0.082
## 33
## 0.115
##
## [[4]]
##
## Autocorrelations of series 'df[[col]]', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.751 0.571 0.443 0.359 0.260 0.189 0.137 0.086 0.052 0.024
## 11 12 13 14 15 16 17 18 19 20 21
## 0.012 0.003 -0.005 -0.007 -0.006 -0.012 -0.012 -0.007 -0.006 -0.014 -0.018
## 22 23 24 25 26 27 28 29 30 31 32
## -0.015 -0.012 -0.003 0.009 0.025 0.009 -0.002 -0.017 -0.023 -0.033 -0.035
## 33
## -0.032
lag 1/5 have the highest autocorrelations, with the rest being lower. Makes sense given martingale property + same time of day at lag-5 mark. Since lag 1 has such high autocorrelation, may be worth trying a model fitted on first order differences.
pairs(df)
m1 <- lm(nox ~ ., data=df)
summary(m1)
##
## Call:
## lm(formula = nox ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -166.19 -42.65 -15.38 20.03 500.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.947559 3.699776 25.122 <2e-16 ***
## noxem 0.036152 0.001077 33.573 <2e-16 ***
## ws -27.338040 1.113436 -24.553 <2e-16 ***
## humidity -7.227542 5.705300 -1.267 0.205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.89 on 2018 degrees of freedom
## Multiple R-squared: 0.437, Adjusted R-squared: 0.4361
## F-statistic: 522.1 on 3 and 2018 DF, p-value: < 2.2e-16
plot(m1)
looks like a quadratic u shaped relationship between residuals and
fitted. Let’s try squaring humidity since that was insignificant and
looked nonlinear
m2 <- lm(nox ~. + I(humidity^2), data=df)
summary(m2)
##
## Call:
## lm(formula = nox ~ . + I(humidity^2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -166.23 -42.65 -15.38 20.04 500.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.892893 3.898850 23.826 <2e-16 ***
## noxem 0.036153 0.001077 33.560 <2e-16 ***
## ws -27.340544 1.115129 -24.518 <2e-16 ***
## humidity -6.758165 11.982859 -0.564 0.573
## I(humidity^2) -0.202849 4.553608 -0.045 0.964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.9 on 2017 degrees of freedom
## Multiple R-squared: 0.437, Adjusted R-squared: 0.4359
## F-statistic: 391.4 on 4 and 2017 DF, p-value: < 2.2e-16
plot(m2)
Didn’t help, what about windspeed since that was nonlinear too. Still
seeing a larger fitted value gives larger variance. Polynomials degree
two don’t seem to work, though there is some improvement. Common theme
is that humidity doesn’t seem to be important.
m3 <- lm(nox ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data=df)
summary(m3)
##
## Call:
## lm(formula = nox ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -174.81 -36.73 -10.08 21.66 471.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.135e+01 6.254e+00 11.409 < 2e-16 ***
## noxem 6.757e-02 4.417e-03 15.297 < 2e-16 ***
## ws -3.285e+01 3.503e+00 -9.378 < 2e-16 ***
## I(ws^2) 3.278e+00 3.920e-01 8.361 < 2e-16 ***
## I(noxem^2) -3.150e-06 8.738e-07 -3.605 0.00032 ***
## noxem:ws -7.445e-03 7.136e-04 -10.434 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.05 on 2016 degrees of freedom
## Multiple R-squared: 0.4944, Adjusted R-squared: 0.4931
## F-statistic: 394.2 on 5 and 2016 DF, p-value: < 2.2e-16
plot(m3)
# try fitted WLS to this, didn't seem to help
m3w <- lm(nox ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data=df, weights = 1/abs(resid(m3)))
summary(m3w)
##
## Call:
## lm(formula = nox ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data = df,
## weights = 1/abs(resid(m3)))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -12.977 -5.670 -2.669 5.045 21.944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.932e+01 1.605e+00 43.18 <2e-16 ***
## noxem 6.475e-02 1.199e-03 53.99 <2e-16 ***
## ws -3.148e+01 8.137e-01 -38.69 <2e-16 ***
## I(ws^2) 3.138e+00 9.492e-02 33.06 <2e-16 ***
## I(noxem^2) -2.692e-06 2.304e-07 -11.68 <2e-16 ***
## noxem:ws -7.278e-03 1.219e-04 -59.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.816 on 2016 degrees of freedom
## Multiple R-squared: 0.9403, Adjusted R-squared: 0.9401
## F-statistic: 6350 on 5 and 2016 DF, p-value: < 2.2e-16
plot(m3w)
More insights from question: - measurements sequential over one year, sorted by day,time. Need to take this dependence into account - hypothesis: noxem/ws are important, noxem is the source, ws is the dispersion. Humidity may have an impact, but so far it’s not showing.
General: - train/test set, CV? for model selection/evaluating performance while checking for overfitting
Dealing with time dependence: - adding lagged/smoothed/rolling mean versions of variables - adding time index/seasonal predictor
Tried first order differencing, not really any that much better than the striaght linear model it seems. Still the same U-shaped residual problem.
df["dy"] <- c(NaN, diff(df$nox))
m4 <- lm(dy ~ noxem + ws + humidity, data=df)
summary(m4)
##
## Call:
## lm(formula = dy ~ noxem + ws + humidity, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -534.62 -40.51 3.74 40.56 457.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.986477 4.930561 -2.837 0.0046 **
## noxem 0.017917 0.001435 12.488 <2e-16 ***
## ws -13.052331 1.483683 -8.797 <2e-16 ***
## humidity 4.981172 7.601862 0.655 0.5124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98.44 on 2017 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.09451, Adjusted R-squared: 0.09316
## F-statistic: 70.17 on 3 and 2017 DF, p-value: < 2.2e-16
plot(m4)